
# coding: utf-8

import csv
import numpy as np
import pandas as pd

# Full training set that the down-sampling step reads from.
src_path = "data/train.csv"
# Down-sampled copy of the training set, written by sample_data().
dst_path = "data/train_tiny.csv"
# Down-sampled data after the feature-engineering steps below.
fe_dst_path = "data/train_tiny_new.csv"


def sample_data(sample_pct, src, dst):
    """Write a random row sample of the CSV file ``src`` to ``dst``.

    The first row of ``src`` is always copied (it is treated as the header).
    Every later row is kept when a uniform draw from [0, 1) is below
    ``sample_pct``. NumPy's global random state is re-seeded with 99 on every
    call, so the same input and rate always select the same rows.

    Args:
        sample_pct: Probability of keeping each non-header row, expressed as
            a fraction (not a percentage).
        src: Path of the CSV file to read.
        dst: Path of the CSV file to write; it is overwritten if it exists.
    """
    import sys

    np.random.seed(99)

    # The csv module expects binary-mode output on Python 2, but text-mode
    # files opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        read_mode, write_mode, open_kwargs = 'r', 'w', {'newline': ''}
    else:
        read_mode, write_mode, open_kwargs = 'r', 'wb', {}

    # Context managers guarantee both files are flushed and closed before the
    # caller reads ``dst`` back.
    with open(src, read_mode, **open_kwargs) as src_file:
        with open(dst, write_mode, **open_kwargs) as dst_file:
            writer = csv.writer(dst_file)
            for i, row in enumerate(csv.reader(src_file)):
                # No random number is consumed for the header row.
                if i == 0 or np.random.uniform(0, 1, 1)[0] < sample_pct:
                    writer.writerow(row)


def check_whether_two_fe_consistent(fe1, fe2):
    """Determine whether two features are consistent.

    Two features are consistent when every distinct value of ``fe1`` is
    always paired with the same value of ``fe2``, i.e. ``fe2`` is fully
    determined by ``fe1``. The relation is not symmetric.

    Args:
        fe1: Iterable of hashable feature values (e.g. a pandas Series).
        fe2: Iterable of feature values, paired with ``fe1`` by position.

    Returns:
        True if no value of ``fe1`` maps to two different values of ``fe2``,
        False otherwise. Empty inputs are consistent.
    """
    first_seen = {}
    # Pair by position so the result does not depend on index labels; if the
    # lengths differ only the common prefix is compared.
    for fe1_val, fe2_val in zip(fe1, fe2):
        if fe1_val in first_seen:
            if fe2_val != first_seen[fe1_val]:
                return False
        else:
            first_seen[fe1_val] = fe2_val
    return True


# Down-sample the raw data (each row is kept with probability 0.01/40) and
# load the resulting sample.
sample_data(0.01/40, src_path, dst_path)
all_data = pd.read_csv(dst_path)

# Columns stored as strings (object) or int64 are treated as categorical.
categorical_features = all_data.select_dtypes(include=["object", "int64"]).columns
categorical_features_num = categorical_features.size

# For every ordered pair of categorical columns, queue the later column for
# removal when its value is fully determined by the earlier one.
drop_list = []
for i in range(categorical_features_num):
    for j in range(i + 1, categorical_features_num):
        first_fe, second_fe = categorical_features[i], categorical_features[j]
        if check_whether_two_fe_consistent(all_data[first_fe], all_data[second_fe]):
            drop_list.append(second_fe)
            # Formatted explicitly so the output is the same on Python 2 and 3.
            print("%s %s" % (first_fe, second_fe))

all_data.drop(drop_list, inplace=True, axis=1)

# Keep only the last two decimal digits of "hour".
# NOTE(review): presumably "hour" is a YYMMDDHH-style stamp, making this the
# hour of day -- confirm against the data set description.
all_data["day_hour"] = np.round(all_data["hour"] % 100)
all_data.drop(["hour"], inplace=True, axis=1)

# Binary flag: 1 where app_id equals 'ecad2386', 0 elsewhere.
# .loc replaces the .ix indexer, which newer pandas releases no longer provide.
all_data['app_id_cat'] = 0
all_data.loc[all_data["app_id"].values == 'ecad2386', 'app_id_cat'] = 1

# Binary flag: 1 where device_id equals 'a99f214a', 0 elsewhere.
all_data['device_id_cat'] = 0
all_data.loc[all_data["device_id"].values == 'a99f214a', 'device_id_cat'] = 1

# The raw identifiers are replaced by the flags above.
all_data.drop(['app_id', 'device_id'], inplace=True, axis=1)

# Persist the engineered features without the DataFrame index column.
all_data.to_csv(fe_dst_path, index=False)

